clusterProfiler
supports over-representation test and gene set enrichment analysis of Gene Ontology. It supports GO annotation from OrgDb object, GMT file and user's own data.
support many species
In github version of clusterProfiler, enrichGO
and gseGO
functions removed the parameter organism and add another parameter OrgDb, so that any species that have OrgDb
object available can be analyzed in clusterProfiler. Bioconductor have already provide OrgDb for about 20 species, see
http://bioconductor.org/packages/release/BiocViews.html#___OrgDb, and users can build OrgDb
via AnnotationHub
.
library(AnnotationHub)
hub <- AnnotationHub()
# snapshotDate(): 2015-12-29
query(hub, "Cricetulus")
# AnnotationHub with 4 records
# # snapshotDate(): 2015-12-29
# # $dataprovider: UCSC, Inparanoid8, ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/
# # $species: Cricetulus griseus
# # $rdataclass: ChainFile, Inparanoid8Db, OrgDb, TwoBitFile
# # additional mcols(): taxonomyid, genome, description, tags,
# # sourceurl, sourcetype
# # retrieve records with, e.g., 'object[["AH10393"]]'
#
# title
# AH10393 | hom.Cricetulus_griseus.inp8.sqlite
# AH13980 | criGri1.2bit
# AH14346 | criGri1ToHg19.over.chain.gz
# AH48061 | org.Cricetulus_griseus.eg.sqlite
Cgriseus <- hub[["AH48061"]]
# loading from cache '/Users/guangchuangyu/.AnnotationHub/54367'
Cgriseus
# OrgDb object:
# | DBSCHEMAVERSION: 2.1
# | DBSCHEMA: NOSCHEMA_DB
# | ORGANISM: Cricetulus griseus
# | SPECIES: Cricetulus griseus
# | CENTRALID: GID
# | Taxonomy ID: 10029
# | Db type: OrgDb
# | Supporting package: AnnotationDbi
#
# Please see: help('select') for usage information
sample_gene <- sample(keys(Cgriseus), 100)
str(sample_gene)
# chr [1:100] "100762355" "100757285" "100773870" "100766902" ...
library(clusterProfiler)
sample_test <- enrichGO(sample_gene, OrgDb=Cgriseus, pvalueCutoff=1, qvalueCutoff=1)
head(summary(sample_test), 2)
# ID Description
# GO:0004983 GO:0004983 neuropeptide Y receptor activity
# GO:0005254 GO:0005254 chloride channel activity
# GeneRatio BgRatio pvalue p.adjust qvalue geneID
# GO:0004983 1/20 6/3946 0.03004660 0.6187407 0.6138746 100773047
# GO:0005254 1/20 6/3946 0.03004660 0.6187407 0.6138746 100773701
# Count
# GO:0004983 1
# GO:0005254 1
support many ID types
The input ID type can be any type that was supported in OrgDb
object.
library(org.Hs.eg.db)
data(geneList)
gene <- names(geneList)[abs(geneList) > 2]
gene.df <- bitr(gene, fromType = "ENTREZID",
toType = c("ENSEMBL", "SYMBOL"),
OrgDb = org.Hs.eg.db)
head(gene.df, 3)
# ENTREZID ENSEMBL SYMBOL
# 1 4312 ENSG00000196611 MMP1
# 2 8318 ENSG00000093009 CDC45
# 3 10874 ENSG00000109255 NMU
ego <- enrichGO(gene = gene,
universe = names(geneList),
OrgDb = org.Hs.eg.db,
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05)
head(summary(ego), 2)
# ID Description GeneRatio
# GO:0005819 GO:0005819 spindle 24/197
# GO:0005876 GO:0005876 spindle microtubule 11/197
# BgRatio pvalue p.adjust qvalue
# GO:0005819 222/11632 3.810608e-13 1.276554e-10 1.139171e-10
# GO:0005876 45/11632 1.527089e-10 2.557874e-08 2.282596e-08
# geneID
# GO:0005819 55143/991/9493/1062/259266/9787/220134/51203/22974/4751/983/4085/81930/332/3832/7272/9212/9055/3833/146909/10112/6790/891/24137
# GO:0005876 220134/51203/983/81930/332/3832/9212/9055/146909/6790/24137
# Count
# GO:0005819 24
# GO:0005876 11
ego2 <- enrichGO(gene = gene.df$ENSEMBL,
OrgDb = org.Hs.eg.db,
keytype = 'ENSEMBL',
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05)
head(summary(ego2), 2)
# ID Description GeneRatio
# GO:0005819 GO:0005819 spindle 28/220
# GO:0005875 GO:0005875 microtubule associated complex 19/220
# BgRatio pvalue p.adjust qvalue
# GO:0005819 298/19428 6.831911e-18 2.336514e-15 1.812254e-15
# GO:0005875 157/19428 1.710039e-14 2.924167e-12 2.268052e-12
# geneID
# GO:0005819 ENSG00000134690/ENSG00000117399/ENSG00000137807/ENSG00000138778/ENSG00000066279/ENSG00000126787/ENSG00000154839/ENSG00000262634/ENSG00000137804/ENSG00000088325/ENSG00000117650/ENSG00000170312/ENSG00000164109/ENSG00000121621/ENSG00000089685/ENSG00000138160/ENSG00000112742/ENSG00000178999/ENSG00000198901/ENSG00000237649/ENSG00000056678/ENSG00000233450/ENSG00000204197/ENSG00000186185/ENSG00000112984/ENSG00000087586/ENSG00000134057/ENSG00000090889
# GO:0005875 ENSG00000134690/ENSG00000137807/ENSG00000138778/ENSG00000121621/ENSG00000089685/ENSG00000138160/ENSG00000178999/ENSG00000237649/ENSG00000056678/ENSG00000233450/ENSG00000204197/ENSG00000186185/ENSG00000112984/ENSG00000087586/ENSG00000090889/ENSG00000186868/ENSG00000276155/ENSG00000277956/ENSG00000163879
# Count
# GO:0005819 28
# GO:0005875 19
ego3 <- enrichGO(gene = gene.df$SYMBOL,
OrgDb = org.Hs.eg.db,
keytype = 'SYMBOL',
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05)
head(summary(ego3), 2)
# ID Description GeneRatio
# GO:0005819 GO:0005819 spindle 24/196
# GO:0005876 GO:0005876 spindle microtubule 11/196
# BgRatio pvalue p.adjust qvalue
# GO:0005819 278/17761 6.023611e-15 2.042004e-12 1.769039e-12
# GO:0005876 52/17761 9.080301e-12 1.539111e-09 1.333370e-09
# geneID
# GO:0005819 CDCA8/CDC20/KIF23/CENPE/ASPM/DLGAP5/SKA1/NUSAP1/TPX2/NEK2/CDK1/MAD2L1/KIF18A/BIRC5/KIF11/TTK/AURKB/PRC1/KIFC1/KIF18B/KIF20A/AURKA/CCNB1/KIF4A
# GO:0005876 SKA1/NUSAP1/CDK1/KIF18A/BIRC5/KIF11/AURKB/PRC1/KIF18B/AURKA/KIF4A
# Count
# GO:0005819 24
# GO:0005876 11
Using SYMBOL
directly is not recommended. User can use setReadable
function to translate geneID
to gene symbol.
ego <- setReadable(ego, OrgDb = org.Hs.eg.db)
ego2 <- setReadable(ego2, OrgDb = org.Hs.eg.db)
head(summary(ego), n=3)
# ID Description GeneRatio BgRatio
# GO:0005819 GO:0005819 spindle 24/197 222/11632
# GO:0005876 GO:0005876 spindle microtubule 11/197 45/11632
# GO:0000793 GO:0000793 condensed chromosome 17/197 150/11632
# pvalue p.adjust qvalue
# GO:0005819 3.810608e-13 1.276554e-10 1.139171e-10
# GO:0005876 1.527089e-10 2.557874e-08 2.282596e-08
# GO:0000793 5.838332e-10 6.519471e-08 5.817847e-08
# geneID
# GO:0005819 CDCA8/CDC20/KIF23/CENPE/ASPM/DLGAP5/SKA1/NUSAP1/TPX2/NEK2/CDK1/MAD2L1/KIF18A/BIRC5/KIF11/TTK/AURKB/PRC1/KIFC1/KIF18B/KIF20A/AURKA/CCNB1/KIF4A
# GO:0005876 SKA1/NUSAP1/CDK1/KIF18A/BIRC5/KIF11/AURKB/PRC1/KIF18B/AURKA/KIF4A
# GO:0000793 CENPE/NDC80/TOP2A/NCAPH/HJURP/SKA1/NEK2/CENPM/CENPN/ERCC6L/MAD2L1/BIRC5/NCAPG/AURKB/CHEK1/AURKA/CCNB1
# Count
# GO:0005819 24
# GO:0005876 11
# GO:0000793 17
head(summary(ego2), n=3)
# ID Description GeneRatio BgRatio
# GO:0005819 GO:0005819 spindle 24/197 222/11632
# GO:0005876 GO:0005876 spindle microtubule 11/197 45/11632
# GO:0000793 GO:0000793 condensed chromosome 17/197 150/11632
# pvalue p.adjust qvalue
# GO:0005819 3.810608e-13 1.276554e-10 1.139171e-10
# GO:0005876 1.527089e-10 2.557874e-08 2.282596e-08
# GO:0000793 5.838332e-10 6.519471e-08 5.817847e-08
# geneID
# GO:0005819 CDCA8/CDC20/KIF23/CENPE/ASPM/DLGAP5/SKA1/NUSAP1/TPX2/NEK2/CDK1/MAD2L1/KIF18A/BIRC5/KIF11/TTK/AURKB/PRC1/KIFC1/KIF18B/KIF20A/AURKA/CCNB1/KIF4A
# GO:0005876 SKA1/NUSAP1/CDK1/KIF18A/BIRC5/KIF11/AURKB/PRC1/KIF18B/AURKA/KIF4A
# GO:0000793 CENPE/NDC80/TOP2A/NCAPH/HJURP/SKA1/NEK2/CENPM/CENPN/ERCC6L/MAD2L1/BIRC5/NCAPG/AURKB/CHEK1/AURKA/CCNB1
# Count
# GO:0005819 24
# GO:0005876 11
# GO:0000793 17
enrichGO
test the whole GO corpus and enriched result may contains very general terms. User can use dropGO
function to remove specific GO terms or GO level. If user want to restrict the result at sepcific GO level, they can use gofilter
function. We also provide a simplify
method to reduce redundancy of enriched GO terms, see the
post.
Visualization functions
dotplot(ego, showCategory=30)
enrichMap(ego, vertex.label.cex=1.2, layout=igraph::layout.kamada.kawai)
cnetplot(ego, foldChange=geneList)
plotGOgraph(ego)
#
# groupGOTerms: GOBPTerm, GOMFTerm, GOCCTerm environments built.
#
# Building most specific GOs ..... ( 335 GO terms found. )
#
# Build GO DAG topology .......... ( 335 GO terms and 667 relations. )
#
# Annotating nodes ............... ( 11632 genes annotated to the GO terms. )
# $dag
# A graphNEL graph with directed edges
# Number of Nodes = 29
# Number of Edges = 50
#
# $complete.dag
# [1] "A graph with 29 nodes."
Gene Set Enrichment Analysis
gsecc <- gseGO(geneList=geneList, ont="CC", OrgDb=org.Hs.eg.db, verbose=F)
head(summary(gsecc))
# ID Description setSize
# GO:0031982 GO:0031982 vesicle 2880
# GO:0031988 GO:0031988 membrane-bounded vesicle 2791
# GO:0005576 GO:0005576 extracellular region 3296
# GO:0065010 GO:0065010 extracellular membrane-bounded organelle 2220
# GO:0070062 GO:0070062 extracellular exosome 2220
# GO:0044421 GO:0044421 extracellular region part 2941
# enrichmentScore NES pvalue p.adjust qvalues
# GO:0031982 -0.2561837 -1.222689 0.001002004 0.03721229 0.02816364
# GO:0031988 -0.2572169 -1.226003 0.001007049 0.03721229 0.02816364
# GO:0005576 -0.2746489 -1.312485 0.001009082 0.03721229 0.02816364
# GO:0065010 -0.2570342 -1.222048 0.001013171 0.03721229 0.02816364
# GO:0070062 -0.2570342 -1.222048 0.001013171 0.03721229 0.02816364
# GO:0044421 -0.2744658 -1.310299 0.001014199 0.03721229 0.02816364
gseaplot(gsecc, geneSetID="GO:0000779")
GO analysis using user's own data
clusterProfiler
provides enricher function for hypergeometric test and GSEA function for gene set enrichment analysis that are designed to accept user defined annotation. They accept two additional parameters TERM2GENE and TERM2NAME. As indicated in the parameter names, TERM2GENE is a data.frame with first column of term ID and second column of corresponding mapped gene and TERM2NAME is a data.frame with first column of term ID and second column of corresponding term name. TERM2NAME is optional.
An example of using enricher and GSEA to analyze DisGeNet annotation is presented in the post, use clusterProfiler as an universal enrichment analysis tool.
GMT files
We provides a function, read.gmt
, that can parse GMT file into a TERM2GENE data.frame that is ready for both enricher
and GSEA
functions.
gmtfile <- system.file("extdata", "c5.cc.v5.0.entrez.gmt", package="clusterProfiler")
c5 <- read.gmt(gmtfile)
egmt <- enricher(gene, TERM2GENE=c5)
head(summary(egmt))
# ID Description
# SPINDLE SPINDLE SPINDLE
# MICROTUBULE_CYTOSKELETON MICROTUBULE_CYTOSKELETON MICROTUBULE_CYTOSKELETON
# CYTOSKELETAL_PART CYTOSKELETAL_PART CYTOSKELETAL_PART
# SPINDLE_MICROTUBULE SPINDLE_MICROTUBULE SPINDLE_MICROTUBULE
# MICROTUBULE MICROTUBULE MICROTUBULE
# CYTOSKELETON CYTOSKELETON CYTOSKELETON
# GeneRatio BgRatio pvalue p.adjust
# SPINDLE 11/82 39/5270 7.667674e-12 6.594200e-10
# MICROTUBULE_CYTOSKELETON 16/82 152/5270 8.449298e-10 3.633198e-08
# CYTOSKELETAL_PART 15/82 235/5270 2.414879e-06 6.623386e-05
# SPINDLE_MICROTUBULE 5/82 16/5270 3.080645e-06 6.623386e-05
# MICROTUBULE 6/82 32/5270 7.740446e-06 1.331357e-04
# CYTOSKELETON 16/82 367/5270 1.308357e-04 1.826293e-03
# qvalue
# SPINDLE 5.327016e-10
# MICROTUBULE_CYTOSKELETON 2.935019e-08
# CYTOSKELETAL_PART 5.350593e-05
# SPINDLE_MICROTUBULE 5.350593e-05
# MICROTUBULE 1.075515e-04
# CYTOSKELETON 1.475340e-03
# geneID
# SPINDLE 991/9493/9787/22974/983/332/3832/7272/9055/6790/24137
# MICROTUBULE_CYTOSKELETON 991/9493/9133/7153/9787/22974/4751/983/332/3832/7272/9055/6790/24137/4137/7802
# CYTOSKELETAL_PART 991/9493/7153/9787/22974/4751/983/332/3832/7272/9055/6790/24137/4137/7802
# SPINDLE_MICROTUBULE 983/332/3832/9055/24137
# MICROTUBULE 983/332/3832/9055/24137/4137
# CYTOSKELETON 991/9493/9133/7153/9787/22974/4751/983/332/3832/7272/9055/6790/24137/4137/7802
# Count
# SPINDLE 11
# MICROTUBULE_CYTOSKELETON 16
# CYTOSKELETAL_PART 15
# SPINDLE_MICROTUBULE 5
# MICROTUBULE 6
# CYTOSKELETON 16
egmt <- setReadable(egmt, OrgDb=org.Hs.eg.db, keytype="ENTREZID")
head(summary(egmt))
# ID Description
# SPINDLE SPINDLE SPINDLE
# MICROTUBULE_CYTOSKELETON MICROTUBULE_CYTOSKELETON MICROTUBULE_CYTOSKELETON
# CYTOSKELETAL_PART CYTOSKELETAL_PART CYTOSKELETAL_PART
# SPINDLE_MICROTUBULE SPINDLE_MICROTUBULE SPINDLE_MICROTUBULE
# MICROTUBULE MICROTUBULE MICROTUBULE
# CYTOSKELETON CYTOSKELETON CYTOSKELETON
# GeneRatio BgRatio pvalue p.adjust
# SPINDLE 11/82 39/5270 7.667674e-12 6.594200e-10
# MICROTUBULE_CYTOSKELETON 16/82 152/5270 8.449298e-10 3.633198e-08
# CYTOSKELETAL_PART 15/82 235/5270 2.414879e-06 6.623386e-05
# SPINDLE_MICROTUBULE 5/82 16/5270 3.080645e-06 6.623386e-05
# MICROTUBULE 6/82 32/5270 7.740446e-06 1.331357e-04
# CYTOSKELETON 16/82 367/5270 1.308357e-04 1.826293e-03
# qvalue
# SPINDLE 5.327016e-10
# MICROTUBULE_CYTOSKELETON 2.935019e-08
# CYTOSKELETAL_PART 5.350593e-05
# SPINDLE_MICROTUBULE 5.350593e-05
# MICROTUBULE 1.075515e-04
# CYTOSKELETON 1.475340e-03
# geneID
# SPINDLE CDC20/KIF23/DLGAP5/TPX2/CDK1/BIRC5/KIF11/TTK/PRC1/AURKA/KIF4A
# MICROTUBULE_CYTOSKELETON CDC20/KIF23/CCNB2/TOP2A/DLGAP5/TPX2/NEK2/CDK1/BIRC5/KIF11/TTK/PRC1/AURKA/KIF4A/MAPT/DNALI1
# CYTOSKELETAL_PART CDC20/KIF23/TOP2A/DLGAP5/TPX2/NEK2/CDK1/BIRC5/KIF11/TTK/PRC1/AURKA/KIF4A/MAPT/DNALI1
# SPINDLE_MICROTUBULE CDK1/BIRC5/KIF11/PRC1/KIF4A
# MICROTUBULE CDK1/BIRC5/KIF11/PRC1/KIF4A/MAPT
# CYTOSKELETON CDC20/KIF23/CCNB2/TOP2A/DLGAP5/TPX2/NEK2/CDK1/BIRC5/KIF11/TTK/PRC1/AURKA/KIF4A/MAPT/DNALI1
# Count
# SPINDLE 11
# MICROTUBULE_CYTOSKELETON 16
# CYTOSKELETAL_PART 15
# SPINDLE_MICROTUBULE 5
# MICROTUBULE 6
# CYTOSKELETON 16
gsegmt <- GSEA(geneList, TERM2GENE=c5, verbose=F)
head(summary(gsegmt))
# ID
# EXTRACELLULAR_REGION EXTRACELLULAR_REGION
# EXTRACELLULAR_REGION_PART EXTRACELLULAR_REGION_PART
# CELL_PROJECTION CELL_PROJECTION
# PROTEINACEOUS_EXTRACELLULAR_MATRIX PROTEINACEOUS_EXTRACELLULAR_MATRIX
# EXTRACELLULAR_MATRIX EXTRACELLULAR_MATRIX
# EXTRACELLULAR_MATRIX_PART EXTRACELLULAR_MATRIX_PART
# Description
# EXTRACELLULAR_REGION EXTRACELLULAR_REGION
# EXTRACELLULAR_REGION_PART EXTRACELLULAR_REGION_PART
# CELL_PROJECTION CELL_PROJECTION
# PROTEINACEOUS_EXTRACELLULAR_MATRIX PROTEINACEOUS_EXTRACELLULAR_MATRIX
# EXTRACELLULAR_MATRIX EXTRACELLULAR_MATRIX
# EXTRACELLULAR_MATRIX_PART EXTRACELLULAR_MATRIX_PART
# setSize enrichmentScore NES
# EXTRACELLULAR_REGION 401 -0.3860230 -1.694496
# EXTRACELLULAR_REGION_PART 310 -0.4101043 -1.761338
# CELL_PROJECTION 87 -0.4729701 -1.739867
# PROTEINACEOUS_EXTRACELLULAR_MATRIX 93 -0.6355317 -2.365007
# EXTRACELLULAR_MATRIX 95 -0.6229461 -2.318356
# EXTRACELLULAR_MATRIX_PART 54 -0.5908035 -2.002728
# pvalue p.adjust qvalues
# EXTRACELLULAR_REGION 0.001310616 0.03192152 0.02442263
# EXTRACELLULAR_REGION_PART 0.001375516 0.03192152 0.02442263
# CELL_PROJECTION 0.001503759 0.03192152 0.02442263
# PROTEINACEOUS_EXTRACELLULAR_MATRIX 0.001555210 0.03192152 0.02442263
# EXTRACELLULAR_MATRIX 0.001557632 0.03192152 0.02442263
# EXTRACELLULAR_MATRIX_PART 0.001631321 0.03192152 0.02442263
Citation
Yu G, Wang L, Han Y and He Q*. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
这篇文章写于
enrichplot
发布之前,现在出图更友好、更漂亮了,请移步《enrichplot: 让你们对clusterProfiler系列包无法自拔》。